import numpy as np
import matplotlib.pyplot as plt  # To visualize
import pandas as pd  # To read data
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML
import seaborn as sn
# Load the Stata file into a DataFrame. preserve_dtypes=False asks pandas to
# upcast numeric columns to its default types instead of keeping Stata's.
data = pd.read_stata(
    'berkeley.dta',
    preserve_dtypes=False,
)

# Quick look at the gender column (count / unique / top / freq).
gender_column = data['gender']
gender_column.describe()
count     4526
unique       2
top       Male
freq      2691
Name: gender, dtype: object
# Label -> integer code tables for the three categorical columns.
gender = {'Male': 0, 'Female': 1}
admit = {'Rejected': 0, 'Admitted': 1}
dept = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}

# Replace each labelled column with its integer codes. A plain list is
# assigned (rather than Series.map) so an unknown label raises KeyError
# instead of silently becoming NaN.
for column, codes in (('gender', gender), ('admit', admit), ('dept', dept)):
    data[column] = [codes[label] for label in data[column]]

# Independent copy of the rows coded as Female (gender == 1).
is_female = data['gender'] == 1
onlywomen = data[is_female].copy()
onlywomen['admit'].describe()
count    1835.000000
mean        0.303542
std         0.459913
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: admit, dtype: float64
# Per-column summary: each key is a column and its list names the reductions
# to apply. A statistic requested for only one column shows as NaN for the
# other in the resulting table.
column_stats = {
    "gender": ["min", "max", "median", "skew"],
    "admit": ["min", "max", "median", "mean"],
}
data.agg(column_stats)
gender admit
min 0.000000 0.00000
max 1.000000 1.00000
median 0.000000 0.00000
skew 0.385339 NaN
mean NaN 0.38776
# Inspect the dtype of every column in the working frame.
# NOTE(review): the recorded output for this cell lists `dept` as object even
# though `dept` is recoded to integers earlier in the file; presumably the
# cells were executed out of order. Confirm by re-running top to bottom.
data.dtypes
applicant     int64
admit         int64
gender        int64
dept         object
dtype: object
# scikit-learn expects 2-D inputs, so each column is pulled out as a NumPy
# array and reshaped into a single-column matrix of shape (n_rows, 1).
gender_values = data['gender'].to_numpy()
admit_values = data['admit'].to_numpy()
X = gender_values.reshape(-1, 1)
y = admit_values.reshape(-1, 1)

# Ordinary least squares of admit on gender (intercept fitted by default).
LR = LinearRegression()
LR.fit(X, y)
LinearRegression()
# Show the fitted slope; coef_ is 2-D here because y was passed as a column.
slope = LR.coef_
print('Coefficients: \n', slope)
Coefficients: 
 [[-0.14164543]]
# statsmodels' OLS does NOT add an intercept on its own, so the design matrix
# must carry an explicit constant column.
X = sm.add_constant(X)

# Fit admit ~ const + gender. The labelled DataFrame (rather than the bare
# NumPy X) is used as the design matrix so the summary reports the terms as
# "const" and "gender".
# Previously the model was fit on data['gender'] alone, i.e. through the
# origin: the reported "gender" coefficient was then just the mean admit rate
# of the gender == 1 rows, not the admit-rate gap between the two groups that
# the scikit-learn fit above estimates.
design = sm.add_constant(data['gender'])
model = sm.OLS(data['admit'], design).fit()
model.summary()
OLS Regression Results
Dep. Variable: admit R-squared (uncentered): 0.096
Model: OLS Adj. R-squared (uncentered): 0.096
Method: Least Squares F-statistic: 482.4
Date: Sun, 10 Oct 2021 Prob (F-statistic): 1.11e-101
Time: 18:52:17 Log-Likelihood: -4049.0
No. Observations: 4526 AIC: 8100.
Df Residuals: 4525 BIC: 8106.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
gender 0.3035 0.014 21.964 0.000 0.276 0.331
Omnibus: 24363.738 Durbin-Watson: 0.015
Prob(Omnibus): 0.000 Jarque-Bera (JB): 574.437
Skew: 0.385 Prob(JB): 1.83e-125
Kurtosis: 1.434 Cond. No. 1.00


Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# List the distinct department values present in the data.
# NOTE(review): the recorded output shows the letters 'A'..'F', i.e. the labels
# from before the integer recode of `dept` earlier in the file; presumably this
# cell ran before that recode. Confirm the execution order.
data['dept'].unique()
array(['A', 'B', 'C', 'D', 'E', 'F'], dtype=object)
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's
# numeric columns.
data.describe()
applicant admit gender dept
count 4526.000000 4526.000000 4526.000000 4526.000000
mean 2263.500000 0.387760 0.405435 2.364781
std 1306.687989 0.487293 0.491030 1.712402
min 1.000000 0.000000 0.000000 0.000000
25% 1132.250000 0.000000 0.000000 1.000000
50% 2263.500000 0.000000 0.000000 2.000000
75% 3394.750000 1.000000 1.000000 4.000000
max 4526.000000 1.000000 1.000000 5.000000
# OLS of Gini on Income_Per_Capita with an explicit intercept column.
# NOTE(review): the `data` frame inspected earlier in this file only shows the
# columns applicant/admit/gender/dept; presumably `data` is expected to be a
# different data set by this point. Confirm, otherwise these lookups raise
# KeyError.
gini_response = data['Gini']
income_design = sm.add_constant(data['Income_Per_Capita'])
Income_Gini = sm.OLS(gini_response, income_design).fit()